import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.express as px
# Main data
url_src = "data/LBNL_file.csv"
df = pd.read_csv(url_src, low_memory=False, parse_dates=True)
# cols = ['state','city','zip_code','customer_segment','self_installed','system_size_DC','installation_date']
# df = df[cols]
df = df.replace(-9999,np.nan)
r = {"COM": "NON-RES","NON-PROFIT": "NON-RES","SCHOOL": "NON-RES",'GOV': 'NON-RES'}
df['customer_segment'] = df['customer_segment'].replace(r, regex=True)
# Remove too low price data to minimice errors
df = df.loc[df.total_installed_price > 100]
df['city'] = df['city'].str.upper()
df = df.loc[df['system_size_DC'] != 0]
# Only California
df = df.loc[df['state'] == 'CA']
df['installation_date'] = pd.to_datetime(df['installation_date'])
df = df.sort_values(by='installation_date')
# Luodaan vuodet
df['year'] = df['installation_date'].dt.year
# Alku vuodet pois (Vähän dataa)
df = df.loc[df['year'] >= 2015]
pd.set_option("display.max_columns", None)
# Remove data that have additional modules or batteries to get comparable data
df = df.loc[df.efficiency_module_2.isna()]
df = df.loc[df.efficiency_module_3.isna()]
df = df[df.total_installed_price.notna()]
df = df.loc[df.battery_rated_capacity_kWh.isna()]
df = df.loc[df.battery_rated_capacity_kW.isna()]
df = df[df.columns.drop(list(df.filter(regex='_2')))]
df = df[df.columns.drop(list(df.filter(regex='_3')))]
df = df.loc[df.installer_name != 'Tesla Energy']
df = df.drop(['battery_rated_capacity_kWh', 'battery_rated_capacity_kW','installer_name',
'system_ID_1', 'third_party_owned', 'state', 'new_construction','zip_code'], axis=1)
df = df.loc[df['rebate_or_grant'] == 0]
df = df.drop(['rebate_or_grant', 'tilt_1', 'azimuth_1', 'ground_mounted'], axis=1)
# Create RES residential column (other data is non residential)
df.loc[df['customer_segment'] == 'RES','RES'] = 1
df['RES'] = df['RES'].replace(np.nan,0)
# Still some additional modules remove
df = df.loc[df.solar_storage_hybrid_inverter_1 == 0]
df = df.loc[df.additional_modules == 0]
df = df.loc[df.bifacial_module_1 == 0]
df = df.loc[df.technology_module_1 != '-9999']
df = df.drop(['additional_modules', 'bifacial_module_1'], axis=1)
df = df.loc[df.additional_inverters == 0]
# Replace data
df = df.replace(np.nan,0)
df = df.replace('-9999','Unknown')
df = df.loc[df.inverter_model_1 != 'Unknown']
df = df.loc[df.BIPV_module_1 == 0]
df = df.drop(['BIPV_module_1'], axis=1)
# Create multiplier to get stars
df['x'] = round(df['total_installed_price']/df['system_size_DC']*df['efficiency_module_1'],2)
df = df.loc[df.x > 1]
# Remove self installed to get more coparable data
df = df.loc[df.self_installed == 0]
df = df.loc[df.expansion_system == 0]
df = df.loc[df.multiple_phase_system == 0]
# Drop useless cols
df = df.drop(['self_installed', 'expansion_system', 'multiple_phase_system', 'customer_segment',
'utility_service_territory','data_provider_1', 'additional_inverters',
'city', 'installation_date', 'tracking','solar_storage_hybrid_inverter_1'], axis=1)
df = df.reset_index()
df = df.drop(['index'], axis=1)
# Create stars by cut function
xmin = df.x.min()
x1 = round(df.x.mean()*0.6666,2)
x2 = round(df.x.mean()*1.3333,2)
xmax = df.x.max()
xbin = [xmin,x1,x2,xmax]
df['stars'] = pd.cut(df.x,
bins=xbin,
labels=[1,2,3]
)
df.stars = df.stars.astype(float)
# -----DF2 (model)-----
df2 = df.copy()
# Categors
df2['module_manufacturer_1'] = df2['module_manufacturer_1'].astype('category').cat.codes
df2['module_model_1'] = df2['module_model_1'].astype('category').cat.codes
df2['technology_module_1'] = df2['technology_module_1'].astype('category').cat.codes
df2['inverter_manufacturer_1'] = df2['inverter_manufacturer_1'].astype('category').cat.codes
df2['inverter_model_1'] = df2['inverter_model_1'].astype('category').cat.codes
print('Shape:',df.shape)
print('Shape:',df2.shape)
print('Cut bin limits:',xbin)